Chapter 4: Clustering

1 Create a new R Markdown file and save it as an empty file named ‘chapter4.Rmd’. Then include the file as a child file in your ‘index.Rmd’ file.

1.1 Create a new R Markdown file and save it as an empty file named ‘chapter4.Rmd’

Done!

1.2 include the file as a child file in your ‘index.Rmd’ file.

Done!

2 Load the Boston data from the MASS package. Explore the structure and the dimensions of the data and describe the dataset briefly, assuming the reader has no previous knowledge of it. (0-1 points)

2.1 Load the Boston data from the MASS package.

# access the MASS package
library(MASS)

# load the data
data("Boston")

# pass Boston to another object for easy typing
bos <- Boston

2.2 Explore the structure and the dimensions of the data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
#explore structure
str(bos)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
#explore dimensions
dim(bos)
## [1] 506  14
#generate a codebook
# string is copy from dataset introduction
codebook <- data.frame(variable = "CRIM - per capita crime rate by town/ZN - proportion of residential land zoned for lots over 25,000 sq.ft./INDUS - proportion of non-retail business acres per town./CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)/NOX - nitric oxides concentration (parts per 10 million)/RM - average number of rooms per dwelling/AGE - proportion of owner-occupied units built prior to 1940/DIS - weighted distances to five Boston employment centres/RAD - index of accessibility to radial highways/TAX - full-value property-tax rate per $10,000/PTRATIO - pupil-teacher ratio by town/B - 1000(Bk-0.63)^2 where Bk is the proportion of blacks by town/LSTAT - % lower status of the population/MEDV - Median value of owner-occupied homes in $1000's") 

codebook <- codebook %>% 
  separate_rows(variable, sep = "/") %>%  # "/" is the delimiter for rows
  separate(variable, sep = " - ",      #" - " is the delimiter for variables
           into = c("name", "description"),  # names of sparated variables
           remove = T)  #remove old column
#check codebook
library(DT)
codebook %>% datatable 

The data set has 506 observations of 14 variables. Variable CHAS is a dummy variable where 1 means the the place having tract that bounds river, and 0 means otherwise. It needs to be converted to a factor.

bos <- bos %>% 
  mutate(chas = chas %>%
           factor() %>% 
           fct_recode(
             "With tracts that bonds river" = "1", #old value 1 to new label
             "Otherwise" = "0") # old value 0 to new label
)

2.3 describe the dataset

Each of the 506 rows in the dataset describes a Boston suburb or town, and it has 14 columns with information such as average number of rooms per dwelling, pupil-teacher ratio, and per capita crime rate. The last row describes the median price of owner-occupied homes.

3 Show a graphical overview of the data and show summaries of the variables in the data. Describe and interpret the outputs, commenting on the distributions of the variables and the relationships between them. (0-2 points)

3.1 Show a graphical overview of the data

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)

my.fun <- function(data, mapping, method = "lm",...){
  p <- ggplot(data = data, mapping = mapping) +
    geom_point(size = 0.3, color = "blue",...) +
    geom_smooth(size = 0.5, color = "red", method = method)
  p
}

names1 <- pull(codebook[1:7,], description)
names1 <- sapply(names1, function(x) paste(strwrap(x, 35), collapse = "\n"))

ggpairs(bos, 
        lower = list(
          continuous = my.fun,
          combo = wrap("facethist", bins = 20)),
        col = 1:7,
        columnLabels = names1)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Visualized relations of Boston dataset, variable #1~#7

Visualized relations of Boston dataset, variable #1~#7

names2 <- pull(codebook[8:14,], description)
names2 <- sapply(names1, function(x) paste(strwrap(x, 35), collapse = "\n"))

ggpairs(bos, 
        lower = list(
          continuous = my.fun),
        col = 8:14,
        columnLabels = names2,
        )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Visualized relations of Boston dataset, variable #8~#14

Visualized relations of Boston dataset, variable #8~#14

3.2 Show summaries of the variables in the data.

library(finalfit)
#summarise the coninuous data
ff_glimpse(bos)$Continuous %>% datatable
# summarise the categorical data
ff_glimpse(bos)$Categorical %>% datatable
bos %>% ggplot(aes(x= crim)) +
  geom_boxplot()

3.3 Describe and interpret the outputs, commenting on the distributions of the variables and the relationships between them.

3.3.1 interpreting continuous variables

There are 13 continuous variables in the dataset. The crime rate of the town was 0.3(0.1~3.7)%; the proportion of a town’s residential land zoned for lots over 25,000 sq.ft. was 0 (0~12.5)%; the proportion of non-retail business acres per town was 9.7(5.2~18.1)%; the nitric oxides concentration was 0.5(0.4~0.6) parts per 10 million; the average number of rooms per dwelling was 6.3±0.7 rooms; the proportion of owner-occupied units built prior to 1940 was 77.5(45.0~94.1)%; the weighted distances to five Boston employment centres was 3.2 (2.1~5.2) kilometers; the index of accessibility to radial highways was 5(4~24) units of accessibility; the full-value property-tax rate was $330(279~666) per $10,000; the pupil-teacher ratio by town was 19.1(17.4~20.2); the Black proportion of population after taking the formula of 1000(Bk-0.63)^2 was 391.4(375.4~396.2); the proportion of population that is lower status was 11.4(6.9~17.0)%; the median value of owner-occupied homes was $21.2(17~25)*1000. #### 3.3.2 interpreting categorical variable

35(6.9%) towns have tracts that bonds Charles River.

3.3.3 commenting on the relationships between variables

to be supplemented.

4 Standardize the dataset and print out summaries of the scaled data. How did the variables change? Create a categorical variable of the crime rate in the Boston dataset (from the scaled crime rate). Use the quantiles as the break points in the categorical variable. Drop the old crime rate variable from the dataset. Divide the dataset to train and test sets, so that 80% of the data belongs to the train set. (0-2 points)

4.1 Standardize the dataset and print out summaries of the scaled data

library(MASS)
bos.s <- as.data.frame(scale(bos[-4])) # there is no need to scale categorical data, so it was held out for scaling
 
                                       # bos.s means Boston Scaled
bos.s$chas <- bos$chas # add categorical variable back for further analysis

4.2 How did the variables change?

ff_glimpse(bos.s)$Con %>% datatable

All the numeric variables after scaling had a mean of 0 and most of variables’ values ranged from -4 and 4, only except for variables crim, which might be due to outliers.

4.3 Use the quantiles as the break points in the categorical variable and drop the old crime rate variable from the dataset.

#generate cutoff according to quantile
bins <- quantile(bos.s$crim)
#generate a categorical variable "crime" and re-code it
bos.s <- bos.s %>% 
  mutate(crime = crim %>% 
           cut(breaks = bins, include.lowest = TRUE) %>% 
           fct_recode("Low" = "[-0.419,-0.411]",
                     "MediumLow" = "(-0.411,-0.39]",
                     "MediumHigh" = "(-0.39,0.00739]",
                     "High" = "(0.00739,9.92]"))
#remove crim
bos.s <- bos.s %>% select(-crim)

4.4 Divide the dataset to train and test sets, so that 80% of the data belongs to the train set

set.seed(2022)
#generate an object containing the number of obs.
n <-  nrow(bos.s)

#
ind <- sample(1:n, size = n*0.8)
#
train <- bos.s[ind,]
test <- bos.s[-ind,]

5 Fit the linear discriminant analysis on the train set. Use the categorical crime rate as the target variable and all the other variables in the dataset as predictor variables. Draw the LDA (bi)plot (0-3 points)

5.1 Fit the linear discriminant analysis on the train set (Use the categorical crime rate as the target variable and all the other variables in the dataset as predictor variables)

lda.fit <- lda(crime ~ ., data = train) 

5.2 Draw the LDA (bi)plot

# the function for lda biplot arrows
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  heads <- coef(x)
  arrows(x0 = 0, y0 = 0, 
         x1 = myscale * heads[,choices[1]], 
         y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
  text(myscale * heads[,choices], labels = row.names(heads), 
       cex = tex, col=color, pos=3)
}
# target classes as numeric
classes <- as.numeric(factor(train$crime))
#plot the lda results
plot(lda.fit, dimen = 2,  pch = classes, col = classes)+
lda.arrows(lda.fit, myscale = 4)

## integer(0)

6 Save the crime categories from the test set and then remove the categorical crime variable from the test dataset. Then predict the classes with the LDA model on the test data. Cross tabulate the results with the crime categories from the test set. Comment on the results. (0-3 points)

6.1 Save the crime categories from the test set and then remove the categorical crime variable from the test dataset

#save
classes.test <- test$crime
#remove old
test$crime <- NULL

6.2 predict the classes with the LDA model on the test data

predicted.test <- predict(lda.fit, test)

6.3 Cross tabulate the results with the crime categories from the test set.

a <- table(correct = classes.test, predicted = predicted.test$class )
a
##             predicted
## correct      Low MediumLow MediumHigh High
##   Low          4         7          0    0
##   MediumLow   10        16          4    0
##   MediumHigh   2        11         17    1
##   High         0         0          0   30
correct.n = 0
for (i in 1:4){
  correct.c <- a[which(rownames(a) == colnames(a)[i]), i]
  correct.n = correct.c+ correct.n
} 
correct.n/(nrow(bos.s)*0.2)
## [1] 0.6620553

6.4 Comment on the results

7 Reload the Boston dataset and standardize the dataset (we did not do this in the Exercise Set, but you should scale the variables to get comparable distances). Calculate the distances between the observations. Run k-means algorithm on the dataset. Investigate what is the optimal number of clusters and run the algorithm again. Visualize the clusters (for example with the pairs() or ggpairs() functions, where the clusters are separated with colors) and interpret the results. (0-4 points)

7.1 Reload the Boston dataset and standardize the dataset

#reload Boston
data("Boston")
bos <- Boston
#standardize the dataset
bos.s <- scale(bos)

7.2 Calculate the distances between the observations

dis_eu <- dist(bos)
summary(dis_eu)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.119  85.624 170.539 226.315 371.950 626.047

7.3 Run k-means algorithm on the dataset

bos.s.km <- kmeans(bos.s, centers = 4) 

7.4 Investigate what is the optimal number of clusters

set.seed(22)

# determine the number of clusters
k_max <- 10

# calculate the total within sum of squares
twcss <- sapply(1:k_max, function(k){kmeans(bos.s, k)$tot.withinss})

# visualize the results
qplot(x = 1:k_max, y = twcss, geom = 'line')

7.5 run the algorithm again

km <- kmeans(bos.s, centers = 3)

7.6 Visualize the clusters

ggpairs(bos, 
        lower = list(
          continuous = wrap("points", size = 0.3, color = factor(km$cluster)),
          combo = wrap("facethist", bins = 20)
          ),
        col = 1:7)

ggpairs(bos[8:14],
        lower = list(
          continuous = wrap("points", size = 0.3, color = factor(km$cluster),
          combo = wrap("facethist", bins = 20))))
## Warning: Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo
## Ignoring unknown parameters: combo

bos %>% ggplot(aes(x = black, y = medv, color = factor(km$cluster))) +
  geom_point() +
  geom_abline(intercept = 40, slope = -0.1)+
  geom_abline(intercept = 30, slope = -0.01) +
  stat_ellipse(geom = "polygon",
               aes(fill = km$cluster),
               alpha = 0.25)

bos %>% ggplot(aes(x = nox, y = age, color = factor(km$cluster))) +
  geom_point() +
  geom_abline(intercept = 140, slope = -120)+
  geom_abline(intercept = 230, slope = -250) +
  stat_ellipse(geom = "polygon",
               aes(fill = km$cluster),
               alpha = 0.25)

bos %>% ggplot(aes(x = dis, y = black, color = factor(km$cluster))) +
  geom_point() +
  geom_abline(intercept = 480, slope = -25)+
  geom_abline(intercept = 400, slope = -80) +
  stat_ellipse(geom = "polygon",
               aes(fill = km$cluster),
               alpha = 0.25)

7.7 interpret the results

8 Bonus: Perform k-means on the original Boston data with some reasonable number of clusters (> 2). Remember to standardize the dataset. Then perform LDA using the clusters as target classes. Include all the variables in the Boston data in the LDA model. Visualize the results with a biplot (include arrows representing the relationships of the original variables to the LDA solution). Interpret the results. Which variables are the most influential linear separators for the clusters? (0-2 points to compensate any loss of points from the above exercises)

8.1 Perform k-means on the original Boston data with some reasonable number of clusters (> 2)(Remember to standardize the dataset)

km <- kmeans(scale(Boston), centers = 3)

8.2 Perform LDA using the clusters as target classes.

bos.s  <- as.data.frame(scale(Boston))
bos.s$km.cluster <- km$cluster
lda.km <- lda(km.cluster ~ ., data = bos.s) 

8.3 Visualize the results with a biplot (include arrows representing the relationships of the original variables to the LDA solution)

# target classes as numeric
classes <- as.numeric(factor(bos.s$km.cluster))
#plot the lda results
plot(lda.km, dimen = 2,  pch = classes, col = classes)+
lda.arrows(lda.km, myscale = 4)

## integer(0)

8.4 Interpret the results

9 Super-Bonus: Run the code below for the (scaled) train data that you used to fit the LDA. The code creates a matrix product, which is a projection of the data points.

bos.s <- as.data.frame(scale(Boston))
#generate cutoff according to quantile
bins <- quantile(bos.s$crim)
#generate a categorical variable "crime" and re-code it
bos.s <- bos.s %>% 
  mutate(crime = crim %>% 
           cut(breaks = bins, include.lowest = TRUE) %>% 
           fct_recode("Low" = "[-0.419,-0.411]",
                     "MediumLow" = "(-0.411,-0.39]",
                     "MediumHigh" = "(-0.39,0.00739]",
                     "High" = "(0.00739,9.92]"))
#remove crim
bos.s <- bos.s %>% select(-crim)
set.seed(2022)
#generate an object containing the number of obs.
n <-  nrow(bos.s)
#
ind <- sample(1:n, size = n*0.8)
#
train <- bos.s[ind,]
test <- bos.s[-ind,]
model_predictors <- dplyr::select(train, -crime)

# check the dimensions
dim(model_predictors)
## [1] 404  13
dim(lda.fit$scaling)
## [1] 13  3
str(model_predictors)
## 'data.frame':    404 obs. of  13 variables:
##  $ zn     : num  -0.487 -0.487 -0.487 -0.487 -0.487 ...
##  $ indus  : num  -0.7196 1.015 -0.0797 -0.1803 -1.1511 ...
##  $ chas   : num  -0.272 -0.272 -0.272 -0.272 -0.272 ...
##  $ nox    : num  -0.4375 1.3661 -0.5669 -0.0923 -0.8172 ...
##  $ rm     : num  1.25 -0.109 -0.56 -1.867 -0.2 ...
##  $ age    : num  0.402 0.939 -1.644 -1.093 -1.292 ...
##  $ dis    : num  -0.2751 -0.7469 0.0714 -0.6058 0.9871 ...
##  $ rad    : num  -0.178 1.66 -0.637 -0.637 -0.637 ...
##  $ tax    : num  -0.601 1.529 -0.779 -0.618 0.129 ...
##  $ ptratio: num  -0.4876 0.8058 0.0667 -0.0257 -0.7185 ...
##  $ black  : num  0.1687 -2.8046 0.4406 -0.0682 0.1303 ...
##  $ lstat  : num  -0.88125 0.35246 -0.24969 -0.00183 -0.49895 ...
##  $ medv   : num  0.98587 -1.17785 0.00731 -0.69944 -0.29714 ...
str(lda.fit$scaling)
##  num [1:13, 1:3] 0.0862 -0.0208 0.4024 -0.1155 0.2563 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:13] "zn" "indus" "nox" "rm" ...
##   ..$ : chr [1:3] "LD1" "LD2" "LD3"
# matrix multiplication
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
as.matrix(model_predictors) %*% lda.fit$scaling
##             LD1         LD2          LD3
## 228 -1.14003053 -1.06380834  1.176797380
## 435 -2.52084943 -0.81709800  0.402381513
## 206  0.01498784  0.42819764  0.757184324
## 311 -2.26434057  0.48160729  0.677912129
## 331  3.15176185  1.60723749 -0.300913367
## 196  2.51878869  1.66407861 -2.356875243
## 262 -2.90725169 -1.65843136  0.323960426
## 191  3.86954230  2.14188542 -1.579166391
## 504 -2.81287613 -1.29776557  0.539263007
## 476 -1.91153258 -1.20778908  1.916110767
## 123 -2.92322169 -1.41236818  0.566274735
## 233 -0.25364273 -1.70822625  1.195679048
## 270  1.64700326 -1.51127399 -5.472023975
## 248  6.20029559  2.74665868 -0.912915654
## 7    2.54856697  1.28312419 -0.313999554
## 349  6.96575880  4.49143339 -3.626055180
## 439 -2.73599958 -1.27307377  0.072562197
## 112 -1.63425095 -0.86491112  0.680325470
## 470 -1.34111835 -0.39715169  2.927049290
## 453 -1.83535544 -1.09019202  2.379955444
## 1    0.36738817  0.76119242 -0.776505449
## 307 -1.16603668 -0.37199137  0.143016485
## 288  5.36601577  3.42244681 -1.932651594
## 449 -1.92864870 -1.06505046  2.423769725
## 354 12.25562958  6.27574298 -4.372718690
## 378 -2.88616563 -1.97035414  2.394499183
## 359 -0.20066843 -3.12556362 -2.583875869
## 157 -3.86684766 -0.47286473 -1.854035599
## 12   3.65751219  1.63620753 -0.639488618
## 463 -1.45294101 -0.89923913  2.547017545
## 316  0.03252007  0.53298999  0.421513870
## 428 -3.11041755 -0.84356316 -0.045505468
## 454 -1.43305200 -1.83165661  1.986845936
## 447 -2.36562562 -1.23615665  1.901274257
## 480 -2.66921101 -1.27459093  2.883403380
## 150 -3.51602432 -1.11514336 -0.140454913
## 197  5.57024031  3.38466431 -3.760732049
## 11   4.23885368  1.30666814 -1.054851960
## 107 -2.52915649 -0.69185686  1.096997341
## 276  0.46759717  0.97337894 -0.739310928
## 62   4.47852316  2.32515546 -0.674010806
## 72   2.05030755  1.12787135  0.362890197
## 3    1.24862049 -0.17073174  0.448467802
## 452 -1.75500199 -1.33350167  2.015039999
## 285  5.50539568  3.70972876 -4.075516333
## 194  3.68362650  2.73138030 -2.511392127
## 290  5.45620503  3.10983059 -2.169215454
## 465 -1.14240508 -0.68140063  2.696210709
## 256  8.28255396  5.48583083 -4.149237820
## 127 -3.18057685 -1.48720703  0.358774592
## 234 -0.80183408 -1.98331418  1.525618749
## 423 -3.01630507 -0.87633538  2.464629420
## 353 10.42196606  5.87181987 -3.408385527
## 287  8.05898987  5.07069990 -4.401672650
## 56   7.26929455  4.21727015 -3.467675035
## 486  0.44443718 -0.10635599  2.469947846
## 382 -2.64775897 -1.64957852  2.337880390
## 19  -0.65816354  0.76330109  0.217402320
## 497 -1.68802964 -0.16312736  1.037804782
## 306 -0.77317819  0.42789123  0.097624384
## 425 -2.93703686 -0.40453822  0.180522495
## 361 -2.21311378 -1.05130717  2.650237427
## 493 -2.65725513 -0.93342755  0.221422758
## 464 -1.41772886 -0.93069404  2.499685014
## 32   0.37022503  0.37899059  0.224421296
## 120 -1.61684211 -0.10644873  0.802431702
## 417 -2.61082823 -1.53923380 -0.458324059
## 99  -0.93481333 -1.32612755  0.997787488
## 218 -0.88153295 -0.85801812  0.766345041
## 467 -1.98839852 -0.56735055  0.242218661
## 438 -2.84898008 -1.17610723 -0.348148202
## 400 -2.62517311 -1.22354908  1.904675400
## 87   0.77740452  0.57962278  0.594563076
## 319 -0.60128110 -0.24667791  0.586109649
## 351  7.30992438  3.82926197 -2.346088527
## 389 -2.99749213 -0.80181392  2.646083495
## 336  2.89669695  1.50208619  0.533171300
## 42   2.72487097  0.93853241  0.153966870
## 243  4.92536865  2.42238514 -1.269449992
## 461 -1.63581826 -1.17630282  1.356443820
## 344  3.03888995  2.39472332 -2.076122984
## 24   0.34172831  0.28960793  0.408871264
## 335  3.65044772  1.58898518  0.270946119
## 146 -3.64546871 -1.62528049 -1.589359083
## 481 -0.45077187 -0.45799959  2.826302166
## 488 -0.89240592 -0.25843426  2.767736264
## 237  1.98602800 -2.47033154 -4.349764322
## 110 -1.96708020 -0.68215253  0.865632420
## 6    2.82988498  1.12321979  0.473540614
## 289  5.40950062  3.30526051 -2.011633586
## 247  6.13826900  2.92188956 -0.608930329
## 279  0.38650941  1.16333335 -0.742977304
## 104 -1.77653679 -0.45578715  0.890345960
## 282  1.91915386  0.73436272 -0.040691230
## 207  0.60865091  0.14880302  0.590064974
## 33   0.19078098  0.07801734 -0.788695194
## 95  -0.28969455  0.48666194 -0.532542479
## 140 -3.29835305 -1.43334424  0.612804130
## 366 -4.67453524  0.30629137  3.945198823
## 313 -1.63868907 -0.37425756  0.703481877
## 267 -2.42836183 -1.14563083 -0.152749359
## 329  2.17337091  1.48223596  0.003809728
## 35  -0.27577041  0.02260238 -0.622622375
## 291  2.00010088  2.65241270 -2.545469615
## 403 -2.65543941 -1.51087205  2.262565420
## 224 -1.12484684 -0.68392283  1.442959749
## 239  3.68210560  2.18800076 -0.995635885
## 102 -1.70366245 -0.78853778  1.001752295
## 16   0.60237688  0.84339924  0.631255218
## 53   4.44256410  2.17296306 -0.780839951
## 383 -2.91995135 -1.05713207  2.660716773
## 177 -0.49726987  0.22645235  0.804996801
## 459 -1.40119524 -0.75001338  1.500346270
## 337  1.80228144  1.19254004  0.678102485
## 348  7.28588009  4.83090183 -3.934627248
## 373 -3.80191517 -4.54279542 -1.015266798
## 15   0.63938408  0.53603691  0.360627903
## 114 -1.77204471 -0.66699103  0.712496168
## 187 -1.71562902 -1.83638517  1.529850250
## 309 -0.97772444 -0.39086251  0.580472975
## 221  0.13223857 -3.40809567 -4.083504323
## 200  5.82804956  4.22473454 -3.931762434
## 81   2.22686364  1.36324465 -0.443203757
## 365 -0.55713955 -5.01494711 -3.574000497
## 496 -1.67908163 -0.14968571  1.078908376
## 201  5.94472762  4.19860957 -4.166462406
## 295  2.49507988  0.96162266  0.179209392
## 130 -2.88137167 -0.81823231  0.566611218
## 396 -2.57874709 -1.44511333  2.389260017
## 29   0.80770796  0.14075053  0.264580591
## 310 -1.29927816 -0.12600958  0.711261178
## 443 -2.53334827 -1.30514804  2.639918167
## 260 -2.84455573 -0.89632834  0.015847095
## 303  2.67761435  1.76257483 -0.830487428
## 38  -0.06850574  0.59980908  0.912946016
## 69   4.01861754  2.38888550 -0.398115108
## 46   1.76692031  1.27679382  0.395685865
## 20  -0.38036426  0.48673442  0.701243753
## 77   0.36621089  0.06245796  0.342499195
## 26   0.60190476  0.81883006 -0.196374960
## 278  2.93018621 -1.04778216 -6.133839931
## 34  -0.18154723  0.34089047  0.226994228
## 499 -2.24392457 -0.51808894  1.013996155
## 315 -0.54139004 -0.41074476  0.553481948
## 193  3.94019818  2.05790170 -1.591290474
## 430 -2.59484929 -1.25049884  0.017601490
## 368 -4.79288155  0.02264942  2.270820582
## 301  6.41140527  3.75763746 -3.264548794
## 113 -2.14138575 -0.60808280  0.817596749
## 259 -3.07386853 -1.48720703  0.078569151
## 314 -1.07881671 -0.24341171  0.626701481
## 456 -2.20098071 -1.03743718  0.007559974
## 242  3.92590585  2.22745838 -0.979244488
## 160 -3.44711534 -1.43553010  0.018281193
## 263 -2.64038698 -2.30904762  0.238736672
## 269 -1.77658429 -1.00086776  0.314385348
## 5    2.80668042  0.42954389  0.581065710
## 223  0.70695276 -3.14863583 -4.123657917
## 151 -3.56217667 -1.39670368  0.119180674
## 305  0.09915639  0.22653790  0.099425812
## 79   1.80023413  0.59266545  0.277318521
## 63   4.94541596  2.34935701 -0.487968937
## 179 -1.55126071 -0.86324596  0.972550804
## 257  3.54990812  2.63814445 -3.096846981
## 498 -1.49085585 -0.10561507  0.847247018
## 502 -2.29557626 -0.87866927  0.474364434
## 168 -2.57662933 -0.74760164 -0.468553943
## 230 -0.99438846 -0.25052879  1.409364477
## 139 -3.17108330 -1.18606827  0.458141587
## 122 -2.79176717 -1.22754345  0.534164395
## 173 -2.07297564 -0.26534778  1.185123130
## 210  1.43881510 -2.21003768 -4.489534190
## 78   0.41254717  0.32046493  0.509490760
## 302  2.86943542  1.71605915 -0.991358833
## 163 -1.87635393 -5.16850503 -4.015476188
## 131 -2.71878153 -1.28256969  0.493197927
## 416 -2.64873476 -1.43973653 -0.329038483
## 421 -2.77179449 -1.37269531  2.058699120
## 500 -2.24983267 -0.21394389  0.987669452
## 181 -2.00737356 -1.91364508  1.183308545
## 55   5.44685669  4.28307256 -3.296814150
## 133 -2.64718817 -1.19483599  0.606855604
## 358 -0.37183876 -3.46056397 -2.716466735
## 413 -3.95513960 -0.91392531  0.976593312
## 450 -2.06849734 -1.22279285  1.693625845
## 83   2.23698876  1.70419767 -0.448437603
## 293  1.98297432  2.79911959 -2.499683864
## 341  1.21205398  0.91475466  0.704098455
## 67   4.54730919  4.23208944 -3.270895559
## 145 -3.63537451 -0.90604808  0.226908322
## 483 -0.37299700 -0.99928633  2.641893017
## 343  3.91970938  1.60529112 -1.045494448
## 91  -1.18371265 -0.23161719  0.503007989
## 236 -0.42779235  0.03049503  1.106881406
## 334  3.57018268  1.57186523  0.348285353
## 136 -2.63497850 -1.30394404  0.457481057
## 401 -2.43537230 -1.26270200  2.213136842
## 231 -0.44622935  0.04494985  1.166769166
## 121 -2.75482642 -1.10968006  0.718900361
## 166 -2.81778714 -0.99245892 -0.360943685
## 355  9.97374252  6.43564155 -3.831759469
## 111 -1.67601005 -0.38167805  0.952920388
## 48   2.89055900  0.92619543 -0.051811812
## 338  2.38707072  1.31102904  0.523236538
## 18   0.44757738  0.40168087  0.425863315
## 175 -1.85137223 -0.13926869  1.034330204
## 266 -2.86359985  0.21444616  0.112238111
## 240  3.70515723  1.98246156 -1.029618654
## 346  6.19532691  2.60609207 -0.600369133
## 189  1.19911580  1.69288416 -1.313494214
## 132 -2.56045249 -1.12081584  0.540609572
## 205  1.81229127  1.62849534 -2.876276349
## 308 -1.01260075  0.18157665  0.080606731
## 277  2.94625697 -1.52631945 -6.305333274
## 326  2.13921495  1.01967606  0.549371802
## 283  3.16923541 -2.38584264 -5.154476087
## 148 -3.57226045 -0.95260992  0.289375503
## 398 -2.72785848 -0.95802306  2.446682295
## 134 -2.35451581 -0.73205401  0.552628592
## 61   4.96662247  2.73464561 -0.444843645
## 58   6.84508639  4.66619303 -4.067601202
## 176 -1.24231020 -0.30714487  1.009026553
## 273  0.02516274  0.48450096 -0.173077524
## 441 -2.42724431 -0.99396754  2.378507238
## 105 -2.26258682 -0.61849338  0.978310173
## 380 -2.92504355 -1.50610551  2.427267061
## 399 -2.66843047 -1.06271111  2.377638293
## 138 -2.98493246 -1.40128451  0.435553508
## 92  -1.19712020 -0.22014746  0.494762260
## 208  0.57091680  0.25509652  0.625781083
## 397 -2.58342513 -1.46599183  2.412439293
## 188  0.08685456  0.97795580 -1.058325447
## 37  -0.88388151  0.20665771  0.856274078
## 436 -2.41221151 -1.44635753  0.324723239
## 59   5.75180825  2.96728060 -0.507445177
## 318  0.15455832  0.29580392  0.516839862
## 238  0.31985840 -0.65743309  1.015956562
## 457 -2.19785735 -0.58563828 -0.157573897
## 479 -2.01394106 -1.11222253  2.478322216
## 342  4.76050824  1.98364951 -1.815609377
## 172 -2.25424482 -0.73090851  0.107335902
## 31   0.58772343  0.43094655  0.087162591
## 80   0.95807533  0.78211175  0.555671630
## 220  0.75062496 -2.90587282 -4.677968649
## 268 -2.35987105 -2.16092680  0.358511441
## 235  1.07746687 -2.79690887 -4.268853061
## 332  4.39026957  3.06363302 -1.922149126
## 101 -1.86770240 -0.91546215  1.072630173
## 170 -2.42392262 -1.19751374 -0.018550297
## 312 -2.00836336 -0.27458122  0.864201908
## 472 -0.82498270 -0.67645314  2.727782692
## 333  4.26778811  2.95433599 -2.111459197
## 294  2.47904337  0.97566109  0.242796966
## 135 -2.65470583 -0.70287793 -0.360566046
## 369 -5.47930209 -1.41487283  4.826389415
## 429 -2.70440737 -1.04746326  0.408040594
## 161 -1.86541177 -3.60388298 -4.834865036
## 8    3.06669425  0.89106603 -0.283735898
## 39  -0.24874755  0.37137091  1.029006029
## 202  3.96165298  3.86563690 -3.595153825
## 473 -1.16620851 -1.00031863  2.778840011
## 47   1.86259346  1.06543087  0.365365418
## 491 -2.75877440 -0.65424219 -0.565370420
## 217  0.36363669 -2.65455694 -4.469679466
## 180 -2.03780769 -1.08849672  1.344010638
## 466 -1.14880713 -0.22312608  2.348921192
## 444 -2.32163445 -1.46225550  2.334076576
## 73   2.00556670  1.19911145  0.491872619
## 272  0.73476976  1.11996636 -0.134583291
## 503 -2.69918923 -0.62093278  0.642967667
## 484  0.46501620  0.33596922  2.763991633
## 129 -3.06291219 -1.46093725  0.508057982
## 299  6.32460841  4.33591904 -3.330740160
## 255  8.22641761  5.33044293 -4.179948860
## 474 -1.87641467 -1.61704640  2.822822536
## 96  -0.80772849 -0.22796139  0.464632684
## 446 -2.72676403 -1.35136409 -0.097122211
## 275  1.76395454 -1.36327763 -5.940672448
## 49   3.25235267  1.07497994 -0.010655066
## 407 -4.06010179 -0.34446049  3.168205048
## 25   0.72204473  0.46621445  0.372480092
## 144 -3.55621943 -1.28474254  0.204261455
## 246  6.13934516  2.94260609 -0.729903440
## 394 -2.62706294 -1.16845048  2.551752748
## 433 -2.53832627 -0.92913570  0.690780915
## 149 -3.59681795 -1.13460578  0.111553126
## 52   4.44276914  2.26322033 -0.878662472
## 440 -2.63936778 -0.99965965  2.598856463
## 183 -2.29928809 -1.42734827  1.337036504
## 167 -3.40276975 -2.96778774  1.082824805
## 106 -2.74025669 -0.70373308  1.119644196
## 475 -1.89649746 -0.45832632  2.520478602
## 487 -0.17252410 -0.38550544  2.549938894
## 265 -3.04833986 -1.37783632  0.165672076
## 410 -3.68012196 -2.30685175  1.707635474
## 182 -2.55591561 -0.77745675  1.626789684
## 174 -1.82370681 -0.61062680  0.914246362
## 420 -2.67891142 -1.45653909 -0.284977730
## 363 -2.98562694 -0.55356760  2.950425128
## 281  0.91052452 -0.49970467  0.224310513
## 489 -3.00181839 -0.75704572  0.208760583
## 225 -1.76329619 -2.28998851  1.565291776
## 4    2.83515884  0.72919343  0.500771006
## 66   4.48245496  4.00489106 -3.230619828
## 364 -1.30631641 -3.22756138 -2.857446111
## 414 -3.59186301 -0.80044017  1.975840576
## 460 -1.57293335 -0.79809446  2.656674783
## 214 -0.11618997 -0.05802293  0.780929280
## 345  3.88756330  2.60370382 -2.010040814
## 451 -2.38783782 -1.20360495 -0.405012205
## 384 -3.01505761 -1.13171630  2.718086947
## 195  3.62063383  2.91017887 -2.635087150
## 432 -2.39697047 -1.50425325  0.321996221
## 203  3.72381757  2.57007929 -3.235697053
## 94  -0.36261953  0.74368143 -0.315537856
## 219  0.12696490 -3.04990027 -4.516308303
## 468 -1.71225748 -1.03984461  2.376680585
## 404 -2.75063793 -0.64273433  2.592156175
## 292  1.75129015  2.17555161 -2.236434689
## 458 -1.99098790 -0.38947646 -0.183850563
## 455 -2.16895994 -1.20365284 -0.330703927
## 190  1.23760134  1.04527390 -1.213084490
## 85   1.29510389  0.54861469  0.473923746
## 169 -2.82569609 -1.25636774 -0.094005248
## 391 -2.64038443 -0.91143538  2.723443479
## 41   2.23060572  2.51115244 -2.374114686
## 501 -1.94298261 -0.44594486  0.779929021
## 322  0.82974711  0.45192471  0.715513872
## 86   0.67402683  0.20285424  0.581312659
## 40   2.27426460  2.82321264 -2.423456959
## 505 -2.46190047 -1.02918341  0.438943233
## 198  5.63268550  3.45968269 -4.141396835
## 448 -2.01014754 -1.02422984  2.264856924
## 412 -3.49578912 -1.81650228  0.331655568
## 156 -2.04236667 -3.35835041 -7.351008919
## 477 -1.73768534 -1.32189990  2.540447716
## 192  4.04526886  2.45758507 -1.724344086
## 250  5.95046040  2.50966539 -0.658567239
## 147 -4.00680144 -0.95939906 -1.294673502
## 492 -2.70208592 -1.05286259 -0.100933098
## 115 -2.25732225 -0.64680186  0.693227251
## 51   4.56025287  2.33766365 -0.882488218
## 418 -3.28645804 -0.80590331  0.938865089
## 44   2.66190792  1.26386132  0.312303997
## 70   3.90048700  2.23771427 -0.313240251
## 327  2.14900534  1.05193754  0.536650646
## 437 -2.76898851 -1.07758840 -0.268898604
## 297  3.04250109  0.71327958  0.131911992
## 252  5.20528748  2.66647393 -0.620123451
## 415 -3.31920264 -0.53779906  0.729678555
## 264 -2.49212008 -1.32094484 -0.154180813
## 82   2.28591088  1.34384625 -0.604373678
## 258 -3.28826437 -2.73757007  0.312657375
## 323  1.11258481  0.83742546  0.663856292
## 204  1.82811852  1.75922662 -2.871857594
## 249  5.84945027  2.52500475 -0.765602602
## 506 -2.09695413 -0.15777771  0.226291542
## 103 -2.34725611 -0.40656734 -1.327516795
## 23   0.26421968  0.06776781  0.367326419
## 317  0.24145906  0.13512209  0.330044911
## 411 -4.20531021 -0.85636523  0.442220810
## 321  0.83237791  0.39524807  0.727439956
## 462 -1.61496518 -1.00271474  2.439488293
## 93  -0.22552031  0.49191820 -0.511202531
## 300  6.34448927  3.72348837 -3.258933257
## 65   7.68843707  2.45750925 -0.940196228
## 171 -2.18358627 -0.70016313 -0.358594559
## 74   2.06764678  1.01078830  0.351561607
## 339  1.21140049  0.91191474  0.752807597
## 209  1.96143049 -2.19747725 -4.687832893
## 45   2.69034706  1.25779257  0.164812761
## 286  5.55827429  3.48515112 -2.904364104
## 22  -0.03476975  0.22365833  0.633959519
## 9    3.69452967  1.29736994 -0.749508135
## 118 -1.64039324 -0.27159867  0.741273769
## 245  5.99191517  3.14359693 -0.835835796
## 251  5.31535899  2.55098619 -0.545310449
## 379 -2.92758779 -1.74592905  2.513738901
## 155 -1.67674703 -3.52381274 -5.758430855
## 154 -3.80666297 -1.07980424 -0.560946415
## 360 -2.07604508 -0.92051577  2.715113510
## 54   4.38766871  2.45885728 -0.688711142
## 89  -0.64875286 -0.47636464  0.321024730
## 419 -3.13787632 -0.92880761 -0.088298503
## 370 -3.21578565 -4.84103769 -1.101076075
## 117 -1.62615232 -0.45506467  0.760157718
## 377 -2.99078303 -1.92901599  2.243758811
## 445 -2.65902713 -1.03516558  1.382750197
## 199  5.54554427  3.25487417 -3.739808533
## 2    1.57048394  0.50984729  0.109698826
## 36  -0.84991672  0.19173860  0.913792923
## 340  1.25400049  0.93741740  0.704377533
## 186 -1.24578346 -0.38422159  1.076796919
## 98  -0.72974539 -1.51666322  0.697983980
## 388 -2.83562742 -0.82848610  2.628903436
## 362 -2.30290293 -1.10439854  2.317094048
## 97  -0.55698293  0.08915451  0.497508923
## 405 -2.77086361 -0.98338269  2.065198505
## 211  1.48775402 -2.44719309 -4.624554439
## 68   3.87073888  2.27953358 -0.263061375
## 216 -0.07217896  0.11794292  0.759099179
## 17   0.53007438  0.85579014  0.690517365
## 14   0.91181180  0.83575373  0.578380613
## 350  7.33575636  3.46606660 -2.384503377
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p1 <- plot_ly(x = matrix_product$LD1, 
        y = matrix_product$LD2, 
        z = matrix_product$LD3, 
        type= 'scatter3d', 
        mode='markers', 
        color = train$crime,
        size = 2)
p1
model_predictors <- dplyr::select(train, -crime)
train.km <- kmeans(model_predictors, centers = 3) 


p2 <- plot_ly(x = matrix_product$LD1, 
        y = matrix_product$LD2, 
        z = matrix_product$LD3, 
        type= 'scatter3d', 
        mode='markers', 
        color = factor(train.km$cluster),
        size = 1.5)
p2
train.crime3 <- train %>% 
  mutate(crime3 = crime %>% 
           fct_recode("Medium" = "MediumHigh" ,
                      "Low" = "MediumLow",
                      "High" = "High",
                      "Low" = "Low"))

km.cluster <- factor(train.km$cluster)
levels(km.cluster) <- c("Medium","Low","High")

accuracy.tab <- table(correct = train.crime3$crime3, kmean.pred = km.cluster)
accuracy.tab
##         kmean.pred
## correct  Medium Low High
##   Low        10 187   15
##   Medium     10  48   37
##   High        5   0   92
correct.n = 0
for (i in 1:3){
  correct.c <- accuracy.tab[which(rownames(accuracy.tab) == colnames(accuracy.tab)[i]), i]
  correct.n = correct.c+ correct.n
} 
correct.n
## [1] 289
correct.n/(nrow(bos.s)*0.8)
## [1] 0.7139328
model_predictors <- dplyr::select(test, -crime)
test.km <- kmeans(model_predictors, centers = 3) 
test.crime3 <- test %>% 
  mutate(crime3 = crime %>% 
           fct_recode("Medium" = "MediumHigh" ,
                      "Low" = "MediumLow",
                      "High" = "High",
                      "Low" = "Low"))

km.cluster <- factor(test.km$cluster)
levels(km.cluster) <- c("Medium","Low","High")

accuracy.tab <- table(correct = test.crime3$crime3, kmean.pred = km.cluster)
accuracy.tab
##         kmean.pred
## correct  Medium Low High
##   Low        13  27    1
##   Medium     19  10    2
##   High        1   0   29
correct.n = 0
for (i in 1:3){
  correct.c <- accuracy.tab[which(rownames(accuracy.tab) == colnames(accuracy.tab)[i]), i]
  correct.n = correct.c+ correct.n
} 
correct.n
## [1] 75
correct.n/(nrow(bos.s)*0.2)
## [1] 0.7411067